On of the most known measures for ranking countries is HDI - Human Development Index. It takes into account: life expectancy at birth (in years), expected years of schooling, mean years of schooling and gross national income per capita. In this report we decided to choose top 24 countries from Latest Human Development Index Ranking and take a closer look at additional information from World Development Indicators in their context. Moreover, we tried to find correlations between Bitcoins’ data from 2009 to 2021, S.P.Composite dataset and some of the mentioned countries. Finally, short-term prediction for Bitcoin Market Price USD was made.
Top 24 countries according to http://hdr.undp.org/en/content/latest-human-development-index-ranking. Because Liechtenstein had many missing values in WDI dataset, we decided to change it to the next top country in the index - France.
hdiTop <- c('Norway', 'Ireland', 'Switzerland', 'Iceland', 'Hong Kong', 'Germany', 'Sweden', 'Netherlands', 'Australia', 'Denmark', 'Finland', 'Singapore', 'United Kingdom', 'New Zealand', 'Belgium', 'Canada', 'United States', 'Austria', 'Israel', 'Japan', 'Slovenia', 'Luxembourg', 'Korea, Rep.', 'Spain', 'France')
During generation of this report we used the following libraries:
libraries
## [1] "dplyr" "tidyr" "ggplot2" "readxl" "imputeTS" "vtable"
## [7] "skimr" "DT" "caret" "gganimate" "ggpubr"
currExchRates <- read.csv("./datasets/CurrencyExchangeRates.csv")
goldPrices <- read.csv("./datasets/Gold prices.csv")
spComposite <- read.csv("./datasets/S&P Composite.csv")
worldDevInd <- read_excel("./datasets/World_Development_Indicators.xlsx", trim_ws=TRUE, na = "..")
worldDevInd <- data.frame(worldDevInd)
btcMetadata <- read.csv("./datasets/BCHAIN_metadata.csv")
btcDiff <- read.csv("./datasets/BCHAIN-DIFF.csv")
btcHrate <- read.csv("./datasets/BCHAIN-HRATE.csv")
btcMkpru <- read.csv("./datasets/BCHAIN-MKPRU.csv")
btcTrvou <- read.csv("./datasets/BCHAIN-TRVOU.csv")
In order to clean the Currency Exchange Rates Dataset we inserted 0’s in Euro currency before ’98 and used special function to fill empty individual places by making mean from adjacent values.
CurrExchRatesClean <- currExchRates %>% select(which(colMeans(is.na(.)) < 0.2))
CurrExchRatesClean <- CurrExchRatesClean %>% mutate(Euro = ifelse(Date < as.Date("1998-10-30", format="%Y-%m-%d"), 0, Euro))
CurrExchRatesClean <- CurrExchRatesClean %>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
CurrExchRatesClean <- na_ma(CurrExchRatesClean, k = 1)
st(CurrExchRatesClean)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| Australian.Dollar | 5978 | 0.768 | 0.143 | 0.483 | 0.665 | 0.869 | 1.105 |
| Bahrain.Dinar | 5978 | 0.376 | 0 | 0.376 | 0.376 | 0.376 | 0.376 |
| Brazilian.Real | 5978 | 2.158 | 0.765 | 0.832 | 1.72 | 2.717 | 4.195 |
| Canadian.Dollar | 5978 | 1.266 | 0.187 | 0.917 | 1.086 | 1.403 | 1.613 |
| Colombian.Peso | 5978 | 2082.838 | 589.734 | 833.18 | 1795.37 | 2450.322 | 3434.89 |
| Danish.Krone | 5978 | 6.282 | 0.912 | 4.665 | 5.612 | 6.804 | 9.006 |
| Euro | 5978 | 1.007 | 0.476 | 0 | 0.938 | 1.314 | 1.599 |
| Icelandic.Krona | 5978 | 92.418 | 24.739 | 54.72 | 70.42 | 116.888 | 147.98 |
| Indian.Rupee | 5978 | 48.181 | 9.451 | 31.37 | 43.133 | 52.967 | 68.778 |
| Japanese.Yen | 5978 | 107.974 | 14.065 | 75.86 | 100.765 | 118.395 | 147 |
| Korean.Won | 5978 | 1103.062 | 160.752 | 756 | 1016.675 | 1186.8 | 1964.8 |
| Kuwaiti.Dinar | 5978 | 0.294 | 0.011 | 0.265 | 0.287 | 0.303 | 0.309 |
| Libyan.Dinar | 5978 | 1.507 | 0.61 | 0.525 | 0.662 | 1.932 | 1.932 |
| Malaysian.Ringgit | 5978 | 3.504 | 0.491 | 2.436 | 3.182 | 3.8 | 4.725 |
| Nepalese.Rupee | 5978 | 77.184 | 15.094 | 49.88 | 68.475 | 84.698 | 109.98 |
| New.Zealand.Dollar | 5978 | 0.66 | 0.118 | 0.393 | 0.579 | 0.736 | 0.882 |
| Norwegian.Krone | 5978 | 6.964 | 1.087 | 4.959 | 6.107 | 7.803 | 9.606 |
| Pakistani.Rupee | 5978 | 70.139 | 23.441 | 30.877 | 51.866 | 90.937 | 115.697 |
| Qatar.Riyal | 5978 | 3.64 | 0 | 3.64 | 3.64 | 3.64 | 3.64 |
| Rial.Omani | 5978 | 0.384 | 0 | 0.384 | 0.384 | 0.384 | 0.385 |
| Saudi.Arabian.Riyal | 5978 | 3.749 | 0.002 | 3.745 | 3.745 | 3.75 | 3.75 |
| Singapore.Dollar | 5978 | 1.503 | 0.183 | 1.201 | 1.361 | 1.688 | 1.851 |
| South.African.Rand | 5978 | 8.076 | 2.849 | 3.53 | 6.218 | 9.941 | 16.771 |
| Sri.Lanka.Rupee | 5978 | 102.232 | 28.663 | 49.57 | 77.54 | 126.29 | 157.646 |
| Swedish.Krona | 5978 | 7.731 | 1.084 | 5.843 | 6.832 | 8.376 | 10.995 |
| Swiss.Franc | 5978 | 1.211 | 0.249 | 0.725 | 0.979 | 1.389 | 1.823 |
| Thai.Baht | 5978 | 35.229 | 5.531 | 24.44 | 31.568 | 39.447 | 56.061 |
| Trinidad.And.Tobago.Dollar | 5978 | 6.309 | 0.18 | 5.839 | 6.26 | 6.376 | 6.789 |
| U.A.E..Dirham | 5978 | 3.672 | 0.001 | 3.671 | 3.672 | 3.672 | 3.675 |
| U.K..Pound.Sterling | 5978 | 1.615 | 0.174 | 1.213 | 1.518 | 1.675 | 2.102 |
| U.S..Dollar | 5978 | 1 | 0 | 1 | 1 | 1 | 1 |
goldPricesClean <- goldPrices %>% mutate(
EURO..AM. = ifelse(Date < as.Date("1998-12-31", format="%Y-%m-%d"), 0, EURO..AM.),
EURO..PM. = ifelse(Date < as.Date("1998-12-31", format="%Y-%m-%d"), 0, EURO..PM.),
USD..PM. = ifelse(Date < as.Date("1968-03-14", format="%Y-%m-%d"), 0, USD..PM.),
GBP..PM. = ifelse(Date < as.Date("1968-03-14", format="%Y-%m-%d"), 0, GBP..PM.))
goldPricesClean <- goldPricesClean %>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
goldPricesClean <- na_ma(goldPricesClean, k = 1)
st(goldPricesClean)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| USD..AM. | 13585 | 575.166 | 490.165 | 34.77 | 280.5 | 841.75 | 2061.5 |
| USD..PM. | 13585 | 574.853 | 490.136 | 0 | 280.3 | 841.1 | 2067.15 |
| GBP..AM. | 13585 | 370.554 | 353.367 | 14.485 | 177.63 | 453.906 | 1574.37 |
| GBP..PM. | 13585 | 370.381 | 353.32 | 0 | 177.615 | 453.787 | 1569.59 |
| EURO..AM. | 13585 | 337.372 | 477.165 | 0 | 0 | 576.189 | 1743.82 |
| EURO..PM. | 13585 | 337.309 | 477.073 | 0 | 0 | 576.776 | 1743.43 |
World Development Indicators’ Dataset required more transformations than the rest. After creating the Years column top HDI countries for our analysis were filtered. We selected only these indicators which had less than 10% of NA’s for filtered countries. The rest of missing values were filled with 0’s.
worldDevInd$Series.Name<-gsub("\\$","D",worldDevInd$Series.Name)
worldDevIndClean <- worldDevInd %>% select(-Series.Code) %>%
pivot_longer('X1970..YR1970.':'X2020..YR2020.', names_to = "Year", values_to = "Value", values_drop_na = FALSE) %>%
pivot_wider(names_from = "Series.Name", values_from = Value) %>% unchop(everything())
worldDevIndClean <- worldDevIndClean %>%
mutate(Year = substr(Year, 2, 5)) %>%
filter(grepl('', Country.Code)) %>%
relocate(Year, .after = Country.Code)
trial <- as.data.frame(worldDevIndClean)
trial <- worldDevIndClean %>% filter(Country.Name %in% hdiTop)
trial <- trial %>% select(which(colMeans(is.na(.)) < 0.10))
trial[is.na(trial)] <- 0
wdiTop <- trial
datatable(skim(wdiTop %>% dplyr::select(-Country.Name, -Country.Code, -Year)) %>% dplyr::select(skim_variable, numeric.mean, numeric.sd, numeric.p0, numeric.p25, numeric.p75, numeric.p100) %>% dplyr::rename(Variable=skim_variable, Mean=numeric.mean, "Std. Dev."=numeric.sd, Min=numeric.p0, "Pctl. 25"=numeric.p25, "Pctl. 75"=numeric.p75, Max=numeric.p100))
spCompositeClean <- spComposite %>%
mutate(Cyclically.Adjusted.PE.Ratio = ifelse(Year < as.Date("1880-12-31", format="%Y-%m-%d"), 0, Cyclically.Adjusted.PE.Ratio)) %>% rename(Date = Year)
spCompositeClean <- spCompositeClean%>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
spCompositeClean <- na_ma(spCompositeClean, k = 1)
st(spComposite)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| S.P.Composite | 1810 | 327.968 | 685.029 | 2.73 | 7.903 | 164.4 | 4493.28 |
| Dividend | 1806 | 6.732 | 12.069 | 0.18 | 0.42 | 7.053 | 59.68 |
| Earnings | 1806 | 15.371 | 28.933 | 0.16 | 0.561 | 14.726 | 158.74 |
| CPI | 1810 | 62.392 | 76.26 | 6.28 | 10.2 | 102.275 | 273.983 |
| Long.Interest.Rate | 1810 | 4.504 | 2.306 | 0.62 | 3.171 | 5.139 | 15.32 |
| Real.Price | 1810 | 621.987 | 733.091 | 73.899 | 186.575 | 706.991 | 4477.204 |
| Real.Dividend | 1806 | 17.498 | 11.114 | 5.445 | 9.417 | 22.301 | 63.511 |
| Real.Earnings | 1806 | 34.907 | 29.841 | 4.576 | 14.063 | 43.768 | 159.504 |
| Cyclically.Adjusted.PE.Ratio | 1690 | 17.215 | 7.048 | 4.784 | 11.898 | 20.913 | 44.198 |
We read four datasets containing information about Bitcoins: Trvou - Bitcoin USD Exchange Trade Volume, Mkpru - Bitcoin Market Price USD, Diff - Bitcoin Difficulty, Hrate - Bitcoin Hash Rate, and decided to gather them into one large set:
btcTrvou <- btcTrvou %>% rename(Trvou = Value)
btcMkpru <- btcMkpru %>% rename(Mkpru = Value)
btcDiff <- btcDiff %>% rename(Diff = Value)
btcHrate <- btcHrate %>% rename(Hrate = Value)
btcAllClean <- btcTrvou %>% left_join(btcMkpru) %>% left_join(btcHrate) %>% left_join(btcDiff)
btcAllClean <- btcAllClean %>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
btcAllClean <- na_ma(btcAllClean, k = 1)
st(btcAllClean)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| Trvou | 4661 | 146728050.012 | 330500192.236 | 0 | 194787.108 | 148443297.885 | 5352015515.54 |
| Mkpru | 4661 | 5141.227 | 10990.321 | 0 | 7.2 | 6499.062 | 63554.44 |
| Hrate | 4661 | 26480643.671 | 46431207.364 | 0 | 12.016 | 38267606.403 | 198514005.71 |
| Diff | 4661 | 3667535953756.88 | 6452342784118.23 | 0 | 1689334.405 | 5363678461480 | 25046487590083 |
In this section we focus mainly on indicators for chosen countries. Later we presented overviews for Bitcoins, Gold Prices and animations for S.P. Composite.
We focused on indicators from 2019 because the HDI rank included HDI values from 2019.
btcAnalysis <- btcAllClean %>% pivot_longer(cols='Trvou':'Diff', names_to = 'Param', values_to = 'Value')
ggplot(btcAnalysis, aes(x=Date, y=Value, color = as.factor(Param))) + geom_line() + facet_wrap(~Param, scales = "free_y") + theme_minimal() + labs(colour = "Bitcoin's Parameter", title = "Bitcoin's Overview")
GoldAnalysis <- goldPricesClean %>% pivot_longer(cols = 'USD..AM.':'EURO..PM.', names_to = 'Curr', values_to = 'Value')
ggplot(data = GoldAnalysis, aes(x = Date, y = Value, color=as.factor(Curr))) +
geom_point(size=0.1) + facet_wrap(~Curr) + theme_minimal() + labs(color='Currency', title='Gold Prices\' Overview')
ggplot(data = spCompositeClean, aes(x = Date, y = S.P.Composite)) +
geom_line(color='blue') + theme_minimal() + labs(title='S.P.Composite through Years') + transition_reveal(Date)
In this section we tried to find correlations using different datasets in order to make better predictions for Bitcoins later in the report. Firstly, some attempts were made with only Bitcoins’ data. Then we used S.P. Composite but this significantly limited our possible training and testing sets thus we decided to not use this in predictions. Finally, we looked for correlations between bitcoins and some indicators for top HDI countries. What is important - Bitcoins’ dataset was based on days and WDI dataset was based on years. To compare values we calculated means for Bitcoins’ years. However, no important correlation was there found.
btcPlusSP <- btcAllClean %>% inner_join(spCompositeClean, on='Date')
## Joining, by = "Date"
btcPlusGold <- btcAllClean %>% inner_join(goldPricesClean, on='Date')
## Joining, by = "Date"
ggscatter(btcAllClean, x = "Mkpru", y = "Hrate",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "Hrate", color = 'plum4')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcAllClean, x = "Mkpru", y = "Diff",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "Diff", color = 'plum3')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcAllClean, x = "Mkpru", y = "Trvou",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "Trvou", color = 'plum2')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcPlusSP, x = "Mkpru", y = "S.P.Composite",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "S.P.Composite", color='maroon2')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcPlusSP, x = "Mkpru", y = "CPI",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "CPI", color='indianred3')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcPlusSP, x = "Mkpru", y = "Earnings",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "Earnings", color='coral2')
## `geom_smooth()` using formula 'y ~ x'
btcYearsSummary <- btcAllClean %>% mutate(Year = format(as.Date(Date, format="%d/%m/%Y"),"%Y")) %>% select(-Date) %>% group_by(Year) %>% dplyr::summarize(MeanDiff = mean(Diff), MeanHrate=mean(Hrate), MeanMkpru=mean(Mkpru), MeanTrvou=mean(Trvou)) %>% filter(Year > 2008 & Year <2021)
btcPlusCountry <- btcYearsSummary %>% inner_join(wdiTop, on='Year')
## Joining, by = "Year"
for (country in hdiTop) {
countryExample <- btcPlusCountry %>% filter(Country.Name == country)
check <- cor(x = countryExample$MeanMkpru, y = countryExample$`GDP per capita (current USD)`, method = "pearson", use = "everything")
if (check < 0.7 | is.na(check)) {next}
print(ggscatter(countryExample, x = "MeanMkpru", y = "GDP per capita (current USD)",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "MeanMkpru", ylab = "GDP per capita (current USD)", color='orchid2', title = country))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
for (country in hdiTop) {
countryExample <- btcPlusCountry %>% filter(Country.Name == country)
check <- cor(x = countryExample$MeanMkpru, y = countryExample$`Urban population growth (annual %)`, method = "pearson", use = "everything")
if (check < 0.7 | is.na(check)) {next}
print(ggscatter(countryExample, x = "MeanMkpru", y = "Urban population growth (annual %)",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "MeanMkpru", ylab = "Urban population growth (annual %)", color='maroon1', title = country))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
set.seed(23)
btcAllClean2 <- btcAllClean %>% select(-Date)
inTraining <- createDataPartition(y = btcAllClean2$Mkpru, p = .70, list = FALSE)
train <- btcAllClean2[inTraining,]
valid_and_test <- btcAllClean2[-inTraining,]
inTesting <- createDataPartition(y = valid_and_test$Mkpru, p = .50, list = FALSE)
valid <- valid_and_test[ -inTesting,]
test <- valid_and_test[ inTesting,]
ctrl <- trainControl(
method = "repeatedcv",
number = 2,
repeats = 5)
fit <- train(Mkpru ~ .,
data = train,
method = "rf",
trControl = ctrl,
ntree = 10)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
fit
## Random Forest
##
## 3265 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times)
## Summary of sample sizes: 1633, 1632, 1633, 1632, 1632, 1633, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 2888.579 0.9342471 853.6425
## 3 2783.717 0.9393712 762.8295
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
pred <- predict(fit, newdata = test)
preddf <- data.frame(Pred = pred, Mkpru = test$Mkpru)
datatable(preddf)
print(ggscatter(preddf, x = "Mkpru", y = "Pred",
xlab = "Mkpru", ylab = "Predicted Mkpru", color='maroon1') + geom_smooth())
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'